
# --- Notebook export: environment setup, Kaggle download, data extraction ---
# NOTE(review): this file is a Jupyter-notebook export; the `%` and `!` lines
# below are IPython magics / shell commands and are not valid plain Python.

# Basic Libraries to import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy.stats import norm, skew
import os
from quickda.explore_data import *
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
# Show all columns, up to 90 rows, and 5-decimal floats in notebook output.
pd.pandas.set_option('display.max_columns',None)
pd.pandas.set_option('display.max_rows',90)
pd.options.display.float_format = '{:.5f}'.format
# Setting seaborn style
sns.set_palette('Reds')
sns.set_style('darkgrid')
!pip -q install kaggle
# Authenticate against Kaggle (expects kaggle.json credentials on disk) and
# download the competition archive into the working directory.
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
api.competitions_list(category='gettingStarted')
api.competition_download_files('house-prices-advanced-regression-techniques')
# Extract train/test CSVs into ./data
# NOTE(review): 'data\\' is a Windows-style path separator — works on Windows;
# confirm behavior if this notebook is run elsewhere.
from zipfile import ZipFile
zf = ZipFile('house-prices-advanced-regression-techniques.zip')
zf.extractall('data\\')
zf.close()
# Load the Kaggle train split; keep Id and the target (SalePrice) aside.
housePrices = pd.read_csv('./data/train.csv')
train_index = housePrices['Id']
target = housePrices['SalePrice']
target_col = 'SalePrice'
# Quick-look cells (output rendered inline in the notebook).
housePrices.head()
housePrices.shape
housePrices.info()
# Load the test split; its Ids are needed later for the submission file.
test_df = pd.read_csv('./data/test.csv')
test_index = test_df['Id']
test_df.head()
housePrices.shape, test_df.shape
from IPython.display import display
with pd.option_context('display.max_rows', 100, 'display.max_columns', 100):
    display(explore(housePrices, method="summarize"))
# Stack train + test so all cleaning/encoding is applied consistently,
# then drop the target and Id (both kept separately above).
df = pd.concat((housePrices, test_df))
indexs = df['Id']
df.drop(['SalePrice', 'Id'],axis = 1, inplace = True)
df.shape, housePrices.shape
df.head()
# Percentage of missing values per column, highest first.
percent_missing = round(df.isnull().sum() * 100 / len(df),3)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
missing_value_df[missing_value_df['percent_missing']>0].sort_values(by='percent_missing', ascending= False )
# For these columns NaN means the feature is absent (e.g. no pool), not
# missing data, so fill with an explicit 'NotAvailable' category.
colToReplace = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
for i in colToReplace:
    df[i] = df[i].fillna('NotAvailable')
df['PoolQC'].value_counts()
round(df[colToReplace].isnull().sum() * 100 / len(df),3)
Garage-related values are missing for houses that have no garage, so the missing entries are replaced with 'NoGarage'.
# Garage columns: the first four are categorical and get the explicit
# 'NoGarage' label; the numeric ones (year built, area, car capacity) get 0.
garageCols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond','GarageYrBlt', 'GarageArea', 'GarageCars']
for i in garageCols[0:4]:
    df[i] = df[i].fillna('NoGarage')
round(df[garageCols].isnull().sum() * 100 / len(df),3)
for i in garageCols[4:]:
    df[i] = df[i].fillna(0)
Basement-related values are missing for houses that have no basement, so the missing entries are replaced with 'NoBasement'.
# Categorical basement columns: NaN means no basement.
basementCols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2']
round(df[basementCols].isnull().sum() * 100 / len(df),3)
for i in basementCols:
    df[i] = df[i].fillna('NoBasement')
round(df[basementCols].isnull().sum() * 100 / len(df),3)
# Numeric basement measurements default to 0 (no basement -> zero area/baths).
for i in ['BsmtFinSF1','BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath','BsmtHalfBath']:
    df[i] = df[i].fillna(0)
# Column-specific defaults: no masonry veneer, and 'Typ' (typical) functionality.
df['MasVnrType'] = df['MasVnrType'].fillna('None')
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
df['Functional'] = df['Functional'].fillna('Typ')
df['Neighborhood'].value_counts()
# Impute LotFrontage with the median frontage of the house's neighborhood.
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
# Re-check what is still missing after the targeted fills above.
percent_missing = round(df.isnull().sum() * 100 / len(df),3)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
missing_value_df[missing_value_df['percent_missing']>0].sort_values(by='percent_missing', ascending= False )
# Remaining categoricals: fill with the most frequent value (mode).
cols = ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType', 'Electrical']
for i in cols:
    df[i] = df[i].fillna(df[i].mode()[0])
# Visual check that no missing values remain.
plt.figure(figsize=(7,5))
sns.heatmap(df.isnull(), cbar=True, cmap= 'YlGnBu')
# Split feature names by dtype for the encoding step below.
categorical_cols = df.dtypes[ df.dtypes == 'object' ].index.tolist()
numerical_cols = df.dtypes[ df.dtypes != 'object' ].index.tolist()
print("Total Number of Features:", df.shape[1])
print("Total Number of Categorical Features:", len(categorical_cols))
print("Total Number of Numerical Features:", len(numerical_cols))
# Custom Encoding
# Ordinal maps: each category is mapped to an integer preserving its natural
# order (higher = better / more finished).  The sentinel fill values used
# earlier ('NoBasement', 'NotAvailable', 'NoGarage') all map to 0.
MSSubclass_map = {'20':1,'30':2,'40':3,'45':4,'50':5,'60':6,'70':7,'75':8,'80':9,
                  '85':10, '90':11,'120':12,'150':13,'160':14,'180':15,'190':16}
LotShape = {'Reg':3,'IR1':2,'IR2':1,'IR3':0}
utilities_map = {'AllPub':4, 'NoSewr':3, 'NoSeWa':2, 'ELO':1}
quality_map = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1, 'NoBasement':0, 'NotAvailable':0, 'NoGarage':0}
basement_exp = {'NoBasement':0,'No':1,'Mn':2,'Av':3,'Gd':4}
bsmtfin = {'NoBasement':0,'Unf':1,'LwQ':2, 'Rec':3,'BLQ':4,'ALQ':5, 'GLQ':6}
centralAir = {'N':0, 'Y':1}
Functional_map = {'Sal':0,'Sev':1,'Maj2':2,'Maj1':3,'Mod':4,'Min2':5,'Min1':6,'Typ':7}
GarageFinish = {'NoGarage':0,'Unf':1,'RFn':2,'Fin':3}
fence_qc = {'GdPrv':4, 'MnPrv':3, 'GdWo':2, 'MnWw':1, 'NotAvailable':0}
# All columns that share the Ex/Gd/TA/Fa/Po quality scale.
q = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
     'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'HeatingQC']
for i in q:
    df[i] = df[i].replace(quality_map)
# MSSubClass is numeric in the raw data but is really a categorical code.
df['MSSubClass'] = df['MSSubClass'].apply(str).replace(MSSubclass_map)
df['LotShape'] = df['LotShape'].replace(LotShape)
df['Utilities'] = df['Utilities'].replace(utilities_map)
df['BsmtExposure'] = df['BsmtExposure'].replace(basement_exp)
df['BsmtFinType1'] = df['BsmtFinType1'].replace(bsmtfin)
df['BsmtFinType2'] = df['BsmtFinType2'].replace(bsmtfin)
df['CentralAir'] = df['CentralAir'].replace(centralAir)
df['Functional'] = df['Functional'].replace(Functional_map)
df['GarageFinish'] = df['GarageFinish'].replace(GarageFinish)
df['Fence'] = df['Fence'].replace(fence_qc)
#Creating a feature -> If house is remodelled then 1 else 0
# NOTE(review): the flag is 1 when YearRemodAdd == YearBuilt, i.e. 1 marks
# houses that were *never* remodelled — the comment above looks inverted;
# confirm the intended meaning.
df['Remodel'] = np.where(df['YearRemodAdd']==df['YearBuilt'], 1, 0)
df.drop('YearRemodAdd', axis=1, inplace=True)
#Transforming variable to age
# NOTE(review): reference year 2020 is hard-coded here and below.
df['HouseAge'] = 2020 - df['YearBuilt']
df.drop('YearBuilt', axis=1, inplace=True)
#Transforming variable to age
# NOTE(review): GarageYrBlt is converted to an age and then the column is
# immediately dropped, so the computed age is discarded — likely unintended.
df['GarageYrBlt'] = 2020 - df['GarageYrBlt']
df.drop('GarageYrBlt', axis=1, inplace=True)
# Aggregate square-footage, bathroom and porch features.
df['TotalSF']=df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
df['Total_Bathrooms'] = (df['FullBath'] + (0.5 * df['HalfBath'])+df['BsmtFullBath']+(0.5 * df['BsmtHalfBath']))
df['Total_porch_sf'] = (df['OpenPorchSF'] + df['3SsnPorch'] +df['EnclosedPorch'] +
                        df['ScreenPorch'] + df['WoodDeckSF'])
# Import library for VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):
    """Compute the variance inflation factor (VIF) for every column of X.

    Parameters
    ----------
    X : pandas.DataFrame of numeric predictor columns.

    Returns
    -------
    pandas.DataFrame with columns 'variables' (the column names) and 'VIF'
    (one score per column, in the same order).
    """
    vif_scores = [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ]
    return pd.DataFrame({"variables": X.columns, "VIF": vif_scores})
def annote_graph(graph):
    """Annotate each bar of a matplotlib bar/count plot with its height.

    The label is placed slightly (0.1 data units) above the bar, centered
    horizontally on it.  ``graph`` is any axes-like object exposing
    ``patches`` and ``text``.
    """
    for bar in graph.patches:
        bar_height = bar.get_height()
        label_x = bar.get_x() + bar.get_width() / 2.
        graph.text(label_x, bar_height + 0.1, bar_height, ha="center")
# Rebuild a train-only frame (first len(train_index) rows of the stacked df)
# with the target re-attached, for correlation / multicollinearity analysis.
temp_train = df.iloc[0:len(train_index)]
temp_train = pd.concat((temp_train, target), axis=1)
# VIF check 1: basement/floor area components vs. the engineered TotalSF.
sns.pairplot(data=temp_train,
             y_vars=['SalePrice'],
             x_vars=['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath'])
X = temp_train[['TotalBsmtSF','2ndFlrSF', 'BsmtFullBath', '1stFlrSF']]
calc_vif(X)
# VIF check 2: bathroom components vs. the engineered Total_Bathrooms.
sns.pairplot(data=temp_train,
             y_vars=['SalePrice'],
             x_vars=['Total_Bathrooms', 'FullBath', 'HalfBath', 'BsmtHalfBath'])
X = temp_train[['Total_Bathrooms', 'FullBath', 'HalfBath', 'BsmtHalfBath']]
calc_vif(X)
# VIF check 3: porch components vs. the engineered Total_porch_sf.
sns.pairplot(data=temp_train,
             y_vars=['SalePrice'],
             x_vars=['OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF'])
X = temp_train[['OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch','WoodDeckSF']]
calc_vif(X)
# Count plots of near-constant features, annotated with bar heights.
fig, ax =plt.subplots(2,2, figsize=(10,10))
graph = sns.countplot(temp_train['PoolQC'], ax=ax[0,0])
annote_graph(graph)
graph = sns.countplot(temp_train['Utilities'], ax=ax[0,1])
annote_graph(graph)
graph = sns.countplot(temp_train['Street'], ax=ax[1,0])
annote_graph(graph)
fig.show()
fig.delaxes(ax[1][1])
# Drop the near-constant features and the collinear originals inspected above.
cols_to_remove = ['PoolQC', 'Utilities', 'Street', 'FullBath', '1stFlrSF']
df.drop(cols_to_remove, axis=1, inplace=True)
df.head()
df.info()
# Target distribution and z-score outlier count.
sns.distplot(target, color='blue')
from scipy import stats
import numpy as np
z = np.abs(stats.zscore(target))
print(z)
# NOTE(review): `threshold` is defined but the literal 3 is used below;
# outliers are only counted here, not removed.
threshold = 3
outlier_index = np.where(z > 3)[0]
print("Number of outliers based on z-score:", len(outlier_index))
# Predictive Power Score: score every feature's power toward SalePrice.
import ppscore as pps
temp_train = df.iloc[0:len(train_index)]
temp_train = pd.concat((temp_train, target), axis=1)
temp_train.head()
col = temp_train.columns#.score will be a column in the matrix below
train_pps = temp_train[col]
pps_mat = pps.matrix(train_pps)
pps_mat[pps_mat['x'] == 'SalePrice'].sort_values(by='ppscore', ascending=False)
# Selecting features only with ppscore > 0
pps_features = pps_mat.loc[(pps_mat['x'] == 'SalePrice') & (pps_mat['ppscore'] > 0)]['y'].values
print("Number of features using PPS: ",len(pps_features)), pps_features
def create_train_test(df, features, trainIndex, target_col='SalePrice'):
    """Select features, one-hot encode categoricals, and split train/test.

    Bug fix: the original implementation dropped the *last* entry of
    ``features`` (``features[:-1]``) on the assumption that the target is
    always listed last.  When the target is absent from ``features`` (e.g.
    ``df.columns`` of a frame whose SalePrice was already dropped), that
    silently discarded a real predictor.  The target is now excluded
    explicitly wherever it appears.

    Parameters
    ----------
    df : pd.DataFrame of stacked train+test rows (train rows first).
    features : iterable of column names to keep; ``target_col`` is removed
        automatically if present.
    trainIndex : sized object whose length marks the train/test boundary.
    target_col : name of the target column to exclude (default 'SalePrice';
        keeps the original call sites working unchanged).

    Returns
    -------
    (train, test) : pd.DataFrames with dummy-encoded categorical columns
        (``drop_first=True`` to avoid the dummy trap).
    """
    keep = [c for c in features if c != target_col]
    df = df[keep]
    categorical_cols = df.dtypes[df.dtypes == 'object'].index.tolist()
    cat_dummies = pd.get_dummies(df[categorical_cols], drop_first=True)
    df = pd.concat([df.drop(categorical_cols, axis=1), cat_dummies], axis=1)
    # The stacked frame has all train rows first, so a positional split works.
    train = df.iloc[0:len(trainIndex)]
    test = df.iloc[len(trainIndex):]
    print('Shape of Train Dataset:', train.shape)
    print('Shape of Test Dataset:', test.shape)
    return (train, test)
# Build two encoded train/test pairs: one with every feature, one restricted
# to the features selected by the predictive power score.
# Train and Test data with all the features
train_all, test_all = create_train_test(df, df.columns, train_index)
# Train and Test data with features selected by predictive power score
train_pps, test_pps = create_train_test(df, pps_features, train_index)
from sklearn.preprocessing import StandardScaler
def scaler_function(train_data, target):
    """Standardize the features and re-attach the (unscaled) target.

    Parameters
    ----------
    train_data : pd.DataFrame of numeric features to z-score.
    target : pd.Series target column, appended as-is.

    Returns
    -------
    (scaler, train) : the fitted StandardScaler (for reuse on test data)
        and a DataFrame of scaled features concatenated with the target.
    """
    std_scaler = StandardScaler()
    scaled_values = std_scaler.fit_transform(train_data)
    scaled_df = pd.DataFrame(scaled_values, columns=train_data.columns)
    return std_scaler, pd.concat((scaled_df, target), axis=1)
# Fit one scaler per feature set; the scalers are reused on the test data
# at submission time.
scaler_all, train_scaled_all = scaler_function(train_all, target)
scaler_pps, train_scaled_pps = scaler_function(train_pps, target)
train_scaled_pps.shape, train_scaled_all.shape
# hpsklearn wraps a hyperopt hyperparameter search around sklearn-style
# regressors.
from hpsklearn import HyperoptEstimator
from hpsklearn import xgboost_regression, gradient_boosting_regression, extra_trees_regression, ada_boost_regression
from hyperopt import tpe
# Candidate model families and their labels for the results tables/pickles.
model_list = [xgboost_regression, gradient_boosting_regression, extra_trees_regression, ada_boost_regression]
model_names = ['xgBoost', 'gradBoost', 'extraTree', 'adaBoost']
from sklearn import metrics
def model_eval(model, y_true, y_pred):
    """Compute standard regression metrics for a set of predictions.

    Parameters
    ----------
    model : unused; kept so existing keyword-argument call sites
        (``model_eval(model=..., ...)``) continue to work.
    y_true : pd.Series of ground-truth target values.
    y_pred : array-like of predicted values.

    Returns
    -------
    list of [mse, rmse, mae, r2], each rounded to 5 decimal places.
    """
    mae = metrics.mean_absolute_error(y_true.values, y_pred)
    mse = metrics.mean_squared_error(y_true.values, y_pred)
    # Fix: reuse the MSE already computed instead of calling
    # mean_squared_error a second time just to take its square root.
    rmse = np.sqrt(mse)
    r2score = metrics.r2_score(y_true.values, y_pred)
    return [round(x, 5) for x in (mse, rmse, mae, r2score)]
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import pickle
import os
from pathlib import Path
def modeling(df, target_col, model_list, model_names, test_size=0.3, folder='model_dir', save_models=False):
    """Run a hyperopt search for each candidate regressor and score it.

    Parameters
    ----------
    df : DataFrame containing the features plus the target column.
    target_col : name of the target column inside ``df``.
    model_list : hpsklearn regressor factories to search over.
    model_names : labels for ``model_list`` (same order); also used as the
        pickle file names when ``save_models`` is True.
    test_size : fraction of rows held out for validation (default 0.3).
    folder : subfolder of ``Models/`` where pickles are written.
    save_models : when True, pickle each fitted HyperoptEstimator.

    Returns
    -------
    (train_score_df, valid_score_df) : DataFrames of [mse, rmse, mae, r2]
        per model, indexed by ``model_names``.

    NOTE(review): reads the module-level globals ``RANDOM_STATE`` and
    ``N_ITER``, which are defined *after* this function in the script —
    fine at call time, but worth promoting to explicit parameters.
    """
    folder = 'Models/'+folder
    p = Path(folder)
    p.mkdir(parents=True, exist_ok=True)
    # evaluating scores -> r2, rmse, mae
    train_scores = []
    valid_scores = []
    features = df.columns.tolist()
    features.remove(target_col)
    X = df[features]
    y = df[target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE, test_size=test_size)
    for algo, name in zip(model_list,model_names):
        # Each estimator runs its own hyperopt search, capped at N_ITER
        # evaluations with a 60s timeout per trial, minimizing MAE.
        model = HyperoptEstimator(regressor=algo(name),
                                  preprocessing=[],
                                  max_evals=N_ITER,
                                  loss_fn=mean_absolute_error,
                                  #algo=tpe.suggest,
                                  trial_timeout=60,
                                  seed=RANDOM_STATE)
        model.fit(X_train.values, y_train.values)
        #--------- Train Data Evaluation ---------
        predictions = model.predict(X_train.values)
        scores = model_eval(model= model, y_true= y_train, y_pred= predictions)
        train_scores.append(scores)
        #--------- Test Data Evaluation ---------
        predictions = model.predict(X_test.values)
        scores = model_eval(model= model, y_true= y_test, y_pred= predictions)
        valid_scores.append(scores)
        if save_models:
            #print(model.best_model())
            pkl_filename = name+'.pkl'
            with open(p/pkl_filename, 'wb') as file:
                pickle.dump(model, file)
            #joblib.dump(model, filename)
    train_score_df = pd.DataFrame(train_scores, columns=['mse','rmse', 'mae','r2'], index= model_names)
    valid_score_df = pd.DataFrame(valid_scores, columns=['mse','rmse', 'mae','r2'], index= model_names)
    return (train_score_df, valid_score_df)
# Hyperparameter-search configuration (read as globals by `modeling`).
RANDOM_STATE = 42
N_ITER = 40
# Run the search on the full feature set.
tr_df, val_df = modeling(df = train_scaled_all,
                         target_col = target_col,
                         model_list = model_list,
                         model_names = model_names,
                         folder = 'fullData'
                         )
tr_df
val_df
# Repeat on the PPS-selected feature subset.
RANDOM_STATE = 42
N_ITER = 40
tr_df_pps, val_df_pps = modeling(df=train_scaled_pps, target_col=target_col,
                                 model_list=model_list, model_names=model_names, folder='pps')
tr_df_pps
val_df_pps
# Loading the best performing model -> xgBoost
# NOTE(review): models are only written to disk when save_models=True is
# passed to `modeling`; the calls above used the default (False), so this
# load relies on a previous run having saved the pickles.
with open('./Models/fullData/xgBoost.pkl', 'rb') as f:
    xgBoost = pickle.load(f)
xgBoost.best_model()
# Compare feature counts before/after PPS selection.
n_feat = housePrices.shape[1]-1
n_feat_pps = len(pps_features)-1
print("Number of Independent Features in ORIGINAL dataset", n_feat)
print("Number of Independent Features selected by Predictive Power Score", n_feat_pps)
print("Percentage of features omitted using PPS: ", (n_feat-n_feat_pps)/n_feat * 100)
In the case of regression, the ppscore uses the mean absolute error (MAE) as the underlying evaluation metric (MAE_model).
The best possible MAE is 0, and higher values are worse.
As a baseline score, we calculate the MAE of a naive model (MAE_naive) that always predicts the median of the target column.
The PPS is the result of the following normalization (and is never smaller than 0): PPS = 1 - (MAE_model / MAE_naive).
Metrics for the xgBoost model trained on all features:
- MAE = 15884.68449
- R-squared = 0.90460
Metrics for the xgBoost model with features selected by PPS :
- MAE = 17013.10227
- R-squared = 0.90928
The performance of the two models is similar, and we have eliminated 67.5% of the features using PPS.
# Preview of the encoded test matrix (all-features variant).
test_all.head(3)
def test_submissions(model, test_data, scaler, model_name, save=False):
test_cols = test_data.columns
test_std = scaler.transform(test_data)
predictions = model.predict(test_std)
d = {'Id': test_index, 'SalePrice': np.round(predictions,3)}
submission = pd.DataFrame(data=d)
if save:
submission.to_csv(('submission_'+model_name+'.csv'),index=False)
return submission
# Load the model trained on all features, predict on the test set, and
# submit the CSV to the competition via the Kaggle API.
with open('./Models/fullData/xgBoost.pkl', 'rb') as f:
    xgBoost_all = pickle.load(f)
xgBoost_all.best_model()
test_all_predictions = test_submissions(xgBoost_all, test_all, scaler_all, 'xgBoost_test', False)
sns.boxplot(test_all_predictions, showmeans=True)
test_all_predictions.to_csv('submission_all.csv', index=False)
api.competition_submit('submission_all.csv','All features','house-prices-advanced-regression-techniques')
# Same flow for the PPS-selected feature model.
with open('./Models/pps/xgBoost.pkl', 'rb') as f:
    xgBoost_pps = pickle.load(f)
xgBoost_pps.best_model()
test_pps_predictions = test_submissions(xgBoost_pps, test_pps, scaler_pps, 'xgBoost_pps', False)
sns.boxplot(test_pps_predictions, showmeans=True)
test_pps_predictions.to_csv('submission_pps.csv', index=False)
api.competition_submit('submission_pps.csv','PPS features','house-prices-advanced-regression-techniques')